doctra 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/__init__.py +21 -18
- doctra/cli/main.py +3 -0
- doctra/engines/layout/paddle_layout.py +11 -77
- doctra/engines/vlm/provider.py +85 -85
- doctra/engines/vlm/service.py +6 -13
- doctra/exporters/html_writer.py +1235 -0
- doctra/parsers/structured_pdf_parser.py +12 -7
- doctra/parsers/table_chart_extractor.py +47 -22
- doctra/ui/__init__.py +5 -0
- doctra/ui/app.py +1012 -0
- doctra/utils/progress.py +200 -49
- doctra/utils/structured_utils.py +49 -49
- doctra/version.py +1 -1
- {doctra-0.2.0.dist-info → doctra-0.3.0.dist-info}/METADATA +38 -1
- {doctra-0.2.0.dist-info → doctra-0.3.0.dist-info}/RECORD +18 -15
- {doctra-0.2.0.dist-info → doctra-0.3.0.dist-info}/WHEEL +0 -0
- {doctra-0.2.0.dist-info → doctra-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {doctra-0.2.0.dist-info → doctra-0.3.0.dist-info}/top_level.txt +0 -0
doctra/__init__.py
CHANGED

@@ -1,19 +1,22 @@
-"""
-Doctra - Document Parsing Library
-Parse, extract, and analyze documents with ease
-"""
-
-from .parsers.structured_pdf_parser import StructuredPDFParser
-from .parsers.table_chart_extractor import ChartTablePDFParser
-from .version import __version__
-
-
-
-'
-'
-
-
-
-
-
+"""
+Doctra - Document Parsing Library
+Parse, extract, and analyze documents with ease
+"""
+
+from .parsers.structured_pdf_parser import StructuredPDFParser
+from .parsers.table_chart_extractor import ChartTablePDFParser
+from .version import __version__
+from .ui import build_demo, launch_ui
+
+__all__ = [
+    'StructuredPDFParser',
+    'ChartTablePDFParser',
+    'build_demo',
+    'launch_ui',
+    '__version__'
+]
+
+# Package metadata
+__author__ = 'Adem Boukhris'
+__email__ = 'boukhrisadam98@gmail.com'  # Replace with your email
 __description__ = 'Parse, extract, and analyze documents with ease'
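The headline change: the package root now re-exports the new Gradio UI entry points alongside the parsers, with an explicit `__all__`. A minimal sketch of the resulting 0.3.0 surface; only the exported names come from this diff, while the constructor and launch arguments are assumptions:

import doctra

print(doctra.__version__)  # "0.3.0"

# Parsers, re-exported from the package root as in 0.2.0
parser = doctra.StructuredPDFParser()  # hypothetical no-arg construction
parser.parse("/abs/path/document.pdf")

# New in 0.3.0: the Gradio UI from doctra/ui/app.py
demo = doctra.build_demo()  # assumed to return a Gradio app object
doctra.launch_ui()          # assumed to build and launch the demo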
doctra/cli/main.py
CHANGED

@@ -259,6 +259,7 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
         click.echo(f"📄 Processing: {pdf_path.name}")
         parser.parse(str(pdf_path.absolute()))
         click.echo("✅ Full document processing completed successfully!")
+        click.echo(f"📁 Output directory: {output_dir.absolute() if output_dir else 'outputs/'}")
 
     except KeyboardInterrupt:
         click.echo("\n⚠️ Processing interrupted by user", err=True)

@@ -444,6 +445,7 @@ def tables(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
         click.echo(f"📄 Processing: {pdf_path.name}")
         parser.parse(str(pdf_path), str(output_dir))
         click.echo("✅ Table extraction completed successfully!")
+        click.echo(f"📁 Output directory: {output_dir.absolute()}")
 
     except KeyboardInterrupt:
         click.echo("\n⚠️ Extraction interrupted by user", err=True)

@@ -522,6 +524,7 @@ def both(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
         click.echo(f"📄 Processing: {pdf_path.name}")
         parser.parse(str(pdf_path), str(output_dir))
         click.echo("✅ Chart and table extraction completed successfully!")
+        click.echo(f"📁 Output directory: {output_dir.absolute()}")
 
     except KeyboardInterrupt:
         click.echo("\n⚠️ Extraction interrupted by user", err=True)
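All three commands (parse, tables, both) gain the same closing echo so users can see where results were written. Each command wraps one of the library parsers; below is a rough Python equivalent of the tables flow, with the parser construction assumed rather than taken from this diff:

from pathlib import Path
from doctra import ChartTablePDFParser

pdf_path = Path("report.pdf")
output_dir = Path("outputs")

# Mirrors the CLI body above: parse, then report where results were written.
parser = ChartTablePDFParser()  # hypothetical no-arg construction
parser.parse(str(pdf_path), str(output_dir))
print(f"📁 Output directory: {output_dir.absolute()}")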
doctra/engines/layout/paddle_layout.py
CHANGED

@@ -4,17 +4,15 @@ import os
 import sys
 import json
 import tempfile
-import logging
 from dataclasses import dataclass, asdict
 from typing import Dict, List, Any, Tuple, Optional
-from tqdm import tqdm
 
 from PIL import Image
 from paddleocr import LayoutDetection  # pip install paddleocr>=2.7.0.3
 from doctra.utils.pdf_io import render_pdf_to_images
 from doctra.engines.layout.layout_models import LayoutBox, LayoutPage
-from doctra.utils.quiet import suppress_output
 from doctra.utils.progress import create_loading_bar
+import warnings
 
 
 class PaddleLayoutEngine:

@@ -40,7 +38,7 @@ class PaddleLayoutEngine:
             (default: "PP-DocLayout_plus-L")
         """
         self.model_name = model_name
-        self.model: Optional[LayoutDetection] = None
+        self.model: Optional["LayoutDetection"] = None
 
     def _ensure_model(self) -> None:
         """

@@ -54,80 +52,16 @@ class PaddleLayoutEngine:
         if self.model is not None:
             return
 
-        # Beautiful loading progress bar
+        # Beautiful loading progress bar (no logging suppression)
         with create_loading_bar(f'Loading PaddleOCR layout model: "{self.model_name}"') as bar:
-            # Monkey-patch tqdm to silence Paddle's download progress bars
-            # (save the originals so they can be restored in the finally block)
-            original_tqdm_init = tqdm.__init__
-            original_tqdm_update = tqdm.update
-            original_tqdm_close = tqdm.close
-
-            def silent_init(self, *args, **kwargs):
-                kwargs['disable'] = True
-                original_tqdm_init(self, *args, **kwargs)
-
-            def silent_update(self, *args, **kwargs):
-                pass  # Do nothing
-
-            def silent_close(self, *args, **kwargs):
-                pass  # Do nothing
-
-            # More comprehensive output suppression
-            # Save original logging levels
-            original_levels = {}
-            loggers_to_silence = ['ppocr', 'paddle', 'PIL', 'urllib3', 'requests']
-            for logger_name in loggers_to_silence:
-                logger = logging.getLogger(logger_name)
-                original_levels[logger_name] = logger.level
-                logger.setLevel(logging.CRITICAL)
-
-            # Also try to silence the root logger temporarily
-            root_logger = logging.getLogger()
-            original_root_level = root_logger.level
-            root_logger.setLevel(logging.CRITICAL)
-
-            # Set environment variables that might help silence PaddlePaddle
-            old_env = {}
-            env_vars_to_set = {
-                'FLAGS_print_model_stats': '0',
-                'FLAGS_enable_parallel_graph': '0',
-                'GLOG_v': '4',  # Only show fatal errors
-                'GLOG_logtostderr': '0',
-                'GLOG_alsologtostderr': '0'
-            }
-
-            for key, value in env_vars_to_set.items():
-                old_env[key] = os.environ.get(key)
-                os.environ[key] = value
-
-            try:
-                # Monkey patch tqdm
-                tqdm.__init__ = silent_init
-                tqdm.update = silent_update
-                tqdm.close = silent_close
-
-                # Silence Paddle's download/init noise with enhanced suppression
-                with suppress_output():
-                    self.model = LayoutDetection(model_name=self.model_name)
-
-            finally:
-                # Restore tqdm methods
-                tqdm.__init__ = original_tqdm_init
-                tqdm.update = original_tqdm_update
-                tqdm.close = original_tqdm_close
-
-                # Restore logging levels
-                for logger_name, level in original_levels.items():
-                    logging.getLogger(logger_name).setLevel(level)
-                root_logger.setLevel(original_root_level)
-
-                # Restore environment variables
-                for key, old_value in old_env.items():
-                    if old_value is None:
-                        os.environ.pop(key, None)
-                    else:
-                        os.environ[key] = old_value
-
+            # Suppress specific paddle extension warning: "No ccache found"
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore",
+                    message=r"No ccache found.*",
+                    category=UserWarning,
+                )
+                self.model = LayoutDetection(model_name=self.model_name)
             bar.update(1)
 
     def predict_pdf(
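The net effect: roughly 75 lines of tqdm monkey-patching, logger muting, and GLOG environment juggling are replaced by one scoped warnings filter. Here is the pattern in isolation, as a minimal self-contained sketch that uses a stand-in load_model instead of PaddleOCR:

import warnings

def load_model():
    # Stand-in for LayoutDetection(model_name=...): anything that emits the
    # "No ccache found" UserWarning during initialization.
    warnings.warn("No ccache found, compilation may be slow.", UserWarning)
    return object()

with warnings.catch_warnings():
    # The filter lives only inside this block; the global warning state is
    # restored on exit, unlike the old approach of mutating loggers and env vars.
    warnings.filterwarnings("ignore", message=r"No ccache found.*", category=UserWarning)
    model = load_model()  # matching warning is suppressed

warnings.warn("No ccache found, again", UserWarning)  # outside the block: shown

Note the trade-off: the old code also silenced download progress bars and Paddle log noise, while 0.3.0 targets this one warning only.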
doctra/engines/vlm/provider.py
CHANGED

@@ -1,86 +1,86 @@
(Whitespace-only rewrite: all 85 lines are removed and re-added with identical visible content; only the re-added side is shown below.)
+from __future__ import annotations
+
+# --- keep these imports to match your snippet style ---
+import io
+import PIL
+import openai
+import outlines
+from pydantic import BaseModel
+from google.genai import Client
+from outlines.inputs import Image
+from anthropic import Anthropic
+# ------------------------------------------------------
+
+def make_model(
+    vlm_provider: str | None = "gemini",
+    vlm_model: str | None = None,
+    *,
+    api_key: str | None = None,
+):
+    """
+    Build a callable Outlines model for VLM processing.
+
+    Creates an Outlines model instance configured for Gemini, OpenAI, Anthropic, or OpenRouter
+    providers. Only one backend is active at a time, with Gemini as the default.
+
+    :param vlm_provider: VLM provider to use ("gemini", "openai", or "anthropic", default: "gemini")
+    :param vlm_model: Model name to use (defaults to provider-specific defaults)
+    :param api_key: API key for the VLM provider (required for all providers)
+    :return: Configured Outlines model instance
+    :raises ValueError: If provider is unsupported or API key is missing
+    """
+    vlm_provider = (vlm_provider or "gemini").lower()
+
+    # Set default models if not provided
+    if vlm_model is None:
+        if vlm_provider == "gemini":
+            vlm_model = "gemini-2.5-pro"
+        elif vlm_provider == "openai":
+            vlm_model = "gpt-5"
+        elif vlm_provider == "anthropic":
+            vlm_model = "claude-opus-4-1"
+        elif vlm_provider == "openrouter":
+            vlm_model = "x-ai/grok-4"
+
+    if vlm_provider == "gemini":
+        if not api_key:
+            raise ValueError("Gemini provider requires api_key to be passed to make_model(...).")
+        # Create the model (exactly like your snippet)
+        return outlines.from_gemini(
+            Client(api_key=api_key),
+            vlm_model,
+        )
+
+    if vlm_provider == "openai":
+        if not api_key:
+            raise ValueError("OpenAI provider requires api_key to be passed to make_model(...).")
+        # this part is for the openai models (exactly like your snippet)
+        return outlines.from_openai(
+            openai.OpenAI(api_key=api_key),
+            vlm_model,
+        )
+
+    if vlm_provider == "anthropic":
+        if not api_key:
+            raise ValueError("Anthropic provider requires api_key to be passed to make_model(...).")
+        # Create the Anthropic client and model (exactly like your snippet)
+        client = Anthropic(api_key=api_key)
+        return outlines.from_anthropic(
+            client,
+            vlm_model,
+        )
+
+    if vlm_provider == "openrouter":
+        if not api_key:
+            raise ValueError("OpenRouter provider requires api_key to be passed to make_model(...).")
+        # Create the OpenRouter client and model (exactly like your snippet)
+        client = openai.OpenAI(
+            base_url="https://openrouter.ai/api/v1",
+            api_key=api_key,
+        )
+        return outlines.from_openai(
+            client,
+            vlm_model
+        )
+
     raise ValueError(f"Unsupported provider: {vlm_provider}. Use 'gemini', 'openai', or 'anthropic'.")
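Although nothing visibly changed, this file defines the call contract the rest of the release relies on: make_model returns a plain Outlines model that is invoked as model(prompt, schema), as service.py below shows. A hedged usage sketch, where ChartData is a made-up schema for illustration:

from pydantic import BaseModel
from doctra.engines.vlm.provider import make_model

class ChartData(BaseModel):  # hypothetical schema, not from the diff
    title: str
    values: list[float]

# Defaults to "gemini-2.5-pro" when vlm_model is omitted, per the code above.
model = make_model(vlm_provider="gemini", api_key="YOUR_KEY")

# Models built via from_gemini/from_openai/from_anthropic are callable;
# doctra's service layer invokes them exactly as model(prompt, schema).
result = model("Describe this chart as structured data.", ChartData)
print(result)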
doctra/engines/vlm/service.py
CHANGED

@@ -15,12 +15,12 @@ class VLMStructuredExtractor:
     from images using Vision Language Models (VLM) with Outlines for type safety.
 
     Usage:
-        vlm = VLMStructuredExtractor(vlm_provider="gemini", api_key="YOUR_KEY"
+        vlm = VLMStructuredExtractor(vlm_provider="gemini", api_key="YOUR_KEY")
         chart = vlm.extract_chart("/abs/path/chart.jpg")
         table = vlm.extract_table("/abs/path/table.jpg")
 
         # Or with Anthropic:
-        vlm = VLMStructuredExtractor(vlm_provider="anthropic", api_key="YOUR_KEY"
+        vlm = VLMStructuredExtractor(vlm_provider="anthropic", api_key="YOUR_KEY")
     """
 
     def __init__(

@@ -29,25 +29,21 @@ class VLMStructuredExtractor:
         vlm_model: str | None = None,
         *,
         api_key: str | None = None,
-        debug: bool = True,
     ):
         """
         Initialize the VLMStructuredExtractor with provider configuration.
 
-        Sets up the VLM model
-        from images.
+        Sets up the VLM model for structured data extraction from images.
 
         :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
        :param vlm_model: Model name to use (defaults to provider-specific defaults)
        :param api_key: API key for the VLM provider (required for all providers)
-        :param debug: Whether to enable debug output for error handling (default: True)
        """
        self.model = make_model(
            vlm_provider,
            vlm_model,
            api_key=api_key,
        )
-        self.debug = debug
 
     def _call(self, prompt_text: str, image_path: str, schema):
        """

@@ -71,13 +67,10 @@ class VLMStructuredExtractor:
             img = img.convert("RGB")
 
             prompt = [prompt_text, Image(img)]
-            return self.model(prompt, schema)
+            result = self.model(prompt, schema)
+
+            return result
         except Exception as e:
-            if self.debug:
-                import traceback
-                print(f"[VLM ERROR] while processing: {image_path}")
-                traceback.print_exc()
-                print(f"[VLM ERROR] type={type(e).__name__} msg={e}")
             # Re-raise so caller can handle/log too
             raise
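Two small real fixes hide in the docstring (both Usage lines were missing their closing parenthesis), and the debug machinery is gone: _call now simply re-raises, so logging moves to the call site. A minimal sketch of caller-side handling that reproduces the removed debug output, assuming extract_chart still raises on failure:

import traceback
from doctra.engines.vlm.service import VLMStructuredExtractor

vlm = VLMStructuredExtractor(vlm_provider="gemini", api_key="YOUR_KEY")
image_path = "/abs/path/chart.jpg"

try:
    chart = vlm.extract_chart(image_path)
except Exception as e:
    # Equivalent of the debug block removed from _call, now at the call site.
    print(f"[VLM ERROR] while processing: {image_path}")
    traceback.print_exc()
    print(f"[VLM ERROR] type={type(e).__name__} msg={e}")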